%!TEX root = ../thesis.tex
% ******************************* Thesis Appendix A ****************************
\chapter{Experiment Specifications} 

\section*{Bayesian Settings}

\subsection{Image Recognition}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}[c]{c | c} 
     \hline
     variable & value \\ [0.5ex] 
     \hline
     learning rate &  0.001\\ 
     
     epochs & 100 \\
     
     batch size & 256 \\
     
     sample size & 10--25 \\
     
     loss & cross-entropy \\
     
     $(\alpha \mu^2)_{init}$ of approximate posterior $q_{\theta}(w|\mathcal{D})$ & -10 \\
     
     optimizer & Adam \cite{kingma2014adam} \\
     
     $\lambda$ in $\ell$-2 normalisation & 0.0005 \\
    
     $\beta_i$ & $\frac{2^{M-i}}{2^M-1}$ \cite{blundell2015weight} \\ [1ex] 
     \hline
    \end{tabular} 
    \renewcommand{\arraystretch}{2}
\end{table}

The sample size can vary from 10 to 25, as this range provided the best results, although other values can also be tried. For most of our experiments, it is either 10 or 25 unless specified otherwise.

\subsection{Image Super Resolution}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}[c]{c | c} 
     \hline
     variable & value \\ [0.5ex] 
     \hline
     learning rate &  0.01\\ 
     
     epochs & 200 \\
     
     batch size & 64 \\
     
     upscale factor & 3 \\
     
     loss & Mean Squared Error \\
     
     seed & 123 \\
     
     $(\alpha \mu^2)_{init}$ of approximate posterior $q_{\theta}(w|\mathcal{D})$ & -10 \\
     
     optimizer & Adam \cite{kingma2014adam} \\
     
     $\lambda$ in $\ell$-2 normalisation & 0.0005 \\
    
     $\beta_i$ & $\frac{2^{M-i}}{2^M-1}$ \cite{blundell2015weight} \\ [1ex] 
     \hline
    \end{tabular} 
    \renewcommand{\arraystretch}{2}
\end{table}


\subsection{Generative Adversarial Network}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}[c]{c | c} 
     \hline
     variable & value \\ [0.5ex] 
     \hline
     learning rate &  0.001\\ 
     
     epochs & 100 \\
     
     batch size & 64 \\
     
     image size & 64 \\
     
     latent vector (nz) & 100 \\
     
     number of generator factor (ngf) & 64 \\
     
     number of discriminator factor (ndf) & 64 \\
     
     upscale factor & 3 \\
     
     loss & Mean Squared Error \\
     
     number of channels (nc) & 3 \\
     
     $(\alpha \mu^2)_{init}$ of approximate posterior $q_{\theta}(w|\mathcal{D})$ & -10 \\
     
     optimizer & Adam \cite{kingma2014adam} \\
     
     $\lambda$ in $\ell$-2 normalisation & 0.0005 \\
    
     $\beta_i$ & $\frac{2^{M-i}}{2^M-1}$ \cite{blundell2015weight} \\ [1ex] 
     \hline
    \end{tabular} 
    \renewcommand{\arraystretch}{2}
\end{table}

\section*{Non Bayesian Settings}

\subsection{Image Recognition}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}[c]{c | c} 
     \hline
     variable & value \\ [0.5ex] 
     \hline
     learning rate &  0.001\\ 
     
     epochs & 100 \\
     
     batch size & 256 \\
     
     loss & cross-entropy \\
     
     initializer & Xavier \cite{glorot2010understanding} or Normal \\
     
     optimizer & Adam \cite{kingma2014adam} \\ [1ex] 
     \hline
    \end{tabular} 
    \renewcommand{\arraystretch}{2}
\end{table}

The weights were first initialized with Xavier initialization~\cite{glorot2010understanding}; however, for consistency with the Bayesian networks, where Normal initialization (mean = 0 and variance = 1) was used, the initializer was changed to Normal initialization.

\pagebreak

\section*{Architectures}

\subsection{LeNet-5}
 
\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}{c c c c c c} 
     \hline
     layer type & width & stride & padding & input shape & nonlinearity \\ [0.5ex] 
     \hline
     convolution ($5\times5$) & 6 & 1 & 0 & $M\times1\times32\times32$ & Softplus \\ 
     
     max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times6\times28\times28$ & \empty \\
     
     convolution ($5\times5$) & 16 & 1 & 0 & $M\times6\times14\times14$ & Softplus \\
     
     max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times16\times10\times10$ & \empty \\
    
     fully-connected & 120 & \empty & \empty & $M\times400$ & Softplus \\
     
     fully-connected & 84 & \empty & \empty & $M\times120$ & Softplus \\
     
     fully-connected & 10 & \empty & \empty & $M\times84$ & \empty \\ [1ex] 
     \hline
    \end{tabular} 
    \renewcommand{\arraystretch}{1}
    \caption{LeNet architecture with original configurations as defined in the paper~\cite{lecun1998gradient}.}
    \label{tab:LeNet}

\end{table}

\subsection{AlexNet}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}{c c c c c c} 
 \hline
 layer type & width & stride & padding & input shape & nonlinearity \\ [0.5ex] 
 \hline
 convolution ($11\times11$) & 64 & 4 & 5 & $M\times3\times32\times32$ & Softplus \\ 
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times64\times32\times32$ & \empty \\
 
 convolution ($5\times5$) & 192 & 1 & 2 & $M\times64\times15\times15$ & Softplus \\
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times192\times15\times15$ & \empty \\
 
 convolution ($3\times3$) & 384 & 1 & 1 & $M\times192\times7\times7$ & Softplus \\
 
 convolution ($3\times3$) & 256 & 1 & 1 & $M\times384\times7\times7$ & Softplus \\
 
 convolution ($3\times3$) & 128 & 1 & 1 & $M\times256\times7\times7$ & Softplus \\
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times128\times7\times7$ & \empty \\
 
 fully-connected & 128 & \empty & \empty & $M\times128$ & \empty \\ [1ex] 
 \hline
\end{tabular}
\renewcommand{\arraystretch}{1}
\caption{AlexNet architecture with original configurations as defined in the paper~\cite{krizhevsky2012imagenet}.}
\label{tab:AlexNet}
\end{table}


\subsection{AlexNetHalf}

\begin{table}[H]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}{c c c c c c} 
 \hline
 layer type & width & stride & padding & input shape & nonlinearity \\ [0.5ex] 
 \hline
 convolution ($11\times11$) & 32 & 4 & 5 & $M\times3\times32\times32$ & Softplus \\ 
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times32\times32\times32$ & \empty \\
 
 convolution ($5\times5$) & 96 & 1 & 2 & $M\times32\times15\times15$ & Softplus \\
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times96\times15\times15$ & \empty \\
 
 convolution ($3\times3$) & 192 & 1 & 1 & $M\times96\times7\times7$ & Softplus \\
 
 convolution ($3\times3$) & 128 & 1 & 1 & $M\times192\times7\times7$ & Softplus \\
 
 convolution ($3\times3$) & 64 & 1 & 1 & $M\times128\times7\times7$ & Softplus \\
 
 max-pooling ($2\times2$) & \empty & 2 & 0 & $M\times64\times7\times7$ & \empty \\
 
 fully-connected & 64 & \empty & \empty & $M\times64$ & \empty \\ [1ex] 
 \hline
\end{tabular}
\renewcommand{\arraystretch}{1.5}
\caption{AlexNetHalf with number of filters halved compared to the original architecture.}
\label{tab:AlexNetHalfArchitecture}
\end{table}

